{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# 05.04 - PARTICIPATE IN KAGGLE"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["!wget --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/ai4eng.v1/main/content/init.py\n", "import init; init.init(force_download=False); init.get_weblink()"]}, {"cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": ["import numpy as np\n", "import matplotlib.pyplot as plt\n", "import tensorflow as tf\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.model_selection import train_test_split\n", "import local.lib.mlutils\n", "import pandas as pd\n", "from tensorflow.keras import Sequential\n", "from tensorflow.keras.layers import Dense, Dropout, Flatten\n", "%matplotlib inline"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## We use Titanic data in [Kaggle](http://www.kaggle.com)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["- Register to [Kaggle](http://www.kaggle.com)\n", "- Enter the competition [Titanic Data at Kaggle](https://www.kaggle.com/c/titanic)\n", "- Download the `train.csv` and `test.csv` files\n", "- **UPLOAD THE FILES** to your notebook environment (in Colab, open the Files tab and upload)"]}, {"cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["(891, 12)\n"]}], "source": ["d = pd.read_csv(\"train.csv\")\n", "print (d.shape)"]}, {"cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "
\n", " \n", " \n", " | \n", " PassengerId | \n", " Survived | \n", " Pclass | \n", " Name | \n", " Sex | \n", " Age | \n", " SibSp | \n", " Parch | \n", " Ticket | \n", " Fare | \n", " Cabin | \n", " Embarked | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 1 | \n", " 0 | \n", " 3 | \n", " Braund, Mr. Owen Harris | \n", " male | \n", " 22.0 | \n", " 1 | \n", " 0 | \n", " A/5 21171 | \n", " 7.2500 | \n", " NaN | \n", " S | \n", "
\n", " \n", " 1 | \n", " 2 | \n", " 1 | \n", " 1 | \n", " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n", " female | \n", " 38.0 | \n", " 1 | \n", " 0 | \n", " PC 17599 | \n", " 71.2833 | \n", " C85 | \n", " C | \n", "
\n", " \n", " 2 | \n", " 3 | \n", " 1 | \n", " 3 | \n", " Heikkinen, Miss. Laina | \n", " female | \n", " 26.0 | \n", " 0 | \n", " 0 | \n", " STON/O2. 3101282 | \n", " 7.9250 | \n", " NaN | \n", " S | \n", "
\n", " \n", " 3 | \n", " 4 | \n", " 1 | \n", " 1 | \n", " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n", " female | \n", " 35.0 | \n", " 1 | \n", " 0 | \n", " 113803 | \n", " 53.1000 | \n", " C123 | \n", " S | \n", "
\n", " \n", " 4 | \n", " 5 | \n", " 0 | \n", " 3 | \n", " Allen, Mr. William Henry | \n", " male | \n", " 35.0 | \n", " 0 | \n", " 0 | \n", " 373450 | \n", " 8.0500 | \n", " NaN | \n", " S | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S "]}, "execution_count": 51, "metadata": {}, "output_type": "execute_result"}], "source": ["d.head()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["**Understand `NaN` values are present**"]}, {"cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": [" PassengerId 0\n", " Survived 0\n", " Pclass 0\n", " Name 0\n", " Sex 0\n", " Age 177\n", " SibSp 0\n", " Parch 0\n", " Ticket 0\n", " Fare 0\n", " Cabin 687\n", " Embarked 2\n"]}], "source": ["for i in d.columns:\n", " print (\"%20s\"%i, np.sum(d[i].isna()))"]}, {"cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [{"data": {"text/plain": ["S 644\n", "C 168\n", "Q 77\n", "Name: Embarked, dtype: int64"]}, "execution_count": 53, "metadata": {}, "output_type": "execute_result"}], "source": ["d.Embarked.value_counts()"]}, {"cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [{"data": {"image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXAAAAD4CAYAAAD1jb0+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAQH0lEQVR4nO3dXYxcZ33H8e+vCeHFUBKTreXGpE4VK1FUNQ6s0kQgBDGhgaDYFyhKhCqrsuQb2pKWijqtVAmpF45UAblASFYCuBXNCyGprSABqRtUtaoM6ySAEyd1CA448stC44YXCTD8ezHH8mq98c7uzuzMk3w/0mrmnJnZ+Wln/POzz55nTqoKSVJ7fmvUASRJi2OBS1KjLHBJapQFLkmNssAlqVHnLueTXXjhhbV27drlfEpJat6+fft+VFUTs/cva4GvXbuWqamp5XxKSWpekufn2u8UiiQ1ygKXpEZZ4JLUKAtckhplgUtSoyxwSWqUBS5JjbLAJalRFrgkNWpZV2JqtNZu+0pf9zu0/cYhJ5E0CI7AJalRFrgkNcoCl6RGWeCS1Kh5CzzJZUmemPH1UpLbkqxM8kiSg93lBcsRWJLUM2+BV9UzVbW+qtYDbwd+DjwEbAP2VNU6YE+3LUlaJgudQtkAfK+qngc2Aju7/TuBTYMMJkk6u4UW+C3APd31VVV1pLt+FFg11wOSbE0ylWRqenp6kTElSbP1XeBJzgNuAr40+7aqKqDmelxV7aiqyaqanJg445RukqRFWsgI/P3AY1V1rNs+lmQ1QHd5fNDhJEkvbyEFfiunp08AdgObu+ubgV2DCiVJml9fBZ5kBXA98OCM3duB65McBN7bbUuSlklfH2ZVVT8D3jJr34/pHZUiSRoBV2JKUqMscElqlAUuSY2ywCWpURa4JDXKApekRlngktQoC1ySGuVZ6V8B+j3b/Cif1zPdS4PnCFySGmWBS1KjLHBJapQFLkmNssAlqVEWuCQ1ygKXpEZZ4JLUKBfyaFn0u+jHBT9S/xyBS1Kj+j2p8flJHkjydJIDSa5NsjLJI0kOdpcXDDusJOm0fkfgdwJfrarLgSuBA8A2YE9VrQP2dNuSpGUyb4EneTPwLuBugKr6ZVWdADYCO7u77QQ2DSukJOlM/YzALwGmgc8neTzJXUlWAKuq6kh3n6PAqmGFlCSdqZ8CPxd4G/DZqroK+BmzpkuqqoCa68FJtiaZSjI1PT291LySpE4/BX4YOFxVe7vtB+gV+rEkqwG6y+NzPbiqdlTVZFVNTkxMDCKzJIk+CryqjgI/THJZt2sD8BSwG9jc7dsM7BpKQknSnPpdyPPnwBeTnAc8B/wpvfK/P8kW4Hng5uFElCTNpa8Cr6ongMk5btow2DiSpH65ElOSGmWBS1KjLHBJapQFLkmNssAlqVEWuCQ1ygKXpEZZ4JLUKAtckhplgUtSoyxwSWqUBS5JjbLAJalRFrgkNcoCl6RGWeCS1CgLXJIaZYFLUqMscElqlAUuSY3q66TGSQ4BPwF+DZysqskkK4H7gLXAIeDmqnpxODElSbMtZAT+nqpaX1Wnzk6/DdhTVeuAPd22JGmZLGUKZSOws7u+E9i09DiSpH71W+AFfD3JviRbu32rqupId/0osGquBybZmmQqydT09PQS40qSTulrDhx4Z1W9kOR3gEeSPD3zxqqqJDXXA6tqB7ADYHJycs77SJIWrq8ReFW90F0eBx4CrgaOJVkN0F0eH1ZISdKZ5i3wJCuSvOnUdeB9wH5gN7C5u9tmYNewQkqSztTPFMoq4KEkp+7/L1X11STfAu5PsgV4Hrh5eDElSbPNW+BV9Rxw5Rz7fwxsGEYoSdL8XIkpSY2ywCWpUf0eRiiNlbXbvtLX/Q5tv3HISaTRcQQuSY2ywCWpURa4JDXKApekRlngktQoC1ySGmWBS1KjLHBJapQFLkmNssAlqVEWuCQ1ygKXpEZZ4JLUKAtckhr
lx8nqDP1+VKuk0XIELkmN6rvAk5yT5PEkD3fblyTZm+TZJPclOW94MSVJsy1kBP5R4MCM7TuAT1XVpcCLwJZBBpMknV1fBZ5kDXAjcFe3HeA64IHuLjuBTcMIKEmaW78j8E8DHwd+022/BThRVSe77cPARXM9MMnWJFNJpqanp5cUVpJ02rwFnuSDwPGq2reYJ6iqHVU1WVWTExMTi/kWkqQ59HMY4TuAm5J8AHgd8NvAncD5Sc7tRuFrgBeGF1OSNNu8BV5VtwO3AyR5N/DXVfXhJF8CPgTcC2wGdg0x56uSx2NLOpulHAf+N8BfJXmW3pz43YOJJEnqx4JWYlbVN4BvdNefA64efCRJUj9cSq9XtH6noQ5tv3HISaTBcym9JDXKApekRlngktQoC1ySGmWBS1KjLHBJapQFLkmNssAlqVEWuCQ1ygKXpEZZ4JLUKAtckhplgUtSo/w0Qo0VT2Ih9c8RuCQ1ygKXpEZZ4JLUKAtckhplgUtSo+Yt8CSvS/LNJN9O8mSST3T7L0myN8mzSe5Lct7w40qSTulnBP4L4LqquhJYD9yQ5BrgDuBTVXUp8CKwZXgxJUmzzVvg1fPTbvM13VcB1wEPdPt3ApuGklCSNKe+FvIkOQfYB1wKfAb4HnCiqk52dzkMXPQyj90KbAW4+OKLFx203wUeh7bfuOjnkKSW9PVHzKr6dVWtB9YAVwOX9/sEVbWjqiaranJiYmKRMSVJsy3oKJSqOgE8ClwLnJ/k1Ah+DfDCgLNJks5i3imUJBPAr6rqRJLXA9fT+wPmo8CHgHuBzcCuYQZ9JfHzPiQNQj9z4KuBnd08+G8B91fVw0meAu5N8g/A48DdQ8wpSZpl3gKvqu8AV82x/zl68+HSq8ZCfnvyD+oaNldiSlKjLHBJapQFLkmNssAlqVEWuCQ1ygKXpEZZ4JLUKAtckhplgUtSoyxwSWqUBS5JjbLAJalRFrgkNcoCl6RGWeCS1CgLXJIaZYFLUqMscElqlAUuSY3q56z0bwX+CVgFFLCjqu5MshK4D1gLHAJurqoXhxdVGp6FnOty0N/Tc2dqsfoZgZ8EPlZVVwDXAB9JcgWwDdhTVeuAPd22JGmZzFvgVXWkqh7rrv8EOABcBGwEdnZ32wlsGlZISdKZFjQHnmQtcBWwF1hVVUe6m47Sm2KZ6zFbk0wlmZqenl5CVEnSTH0XeJI3Al8Gbquql2beVlVFb378DFW1o6omq2pyYmJiSWElSaf1VeBJXkOvvL9YVQ92u48lWd3dvho4PpyIkqS5zFvgSQLcDRyoqk/OuGk3sLm7vhnYNfh4kqSXM+9hhMA7gD8BvpvkiW7f3wLbgfuTbAGeB24eTkRJ0lzmLfCq+k8gL3PzhsHGkST1y5WYktQoC1ySGmWBS1KjLHBJapQFLkmN6ucwQklD5KcWarEcgUtSoyxwSWqUUyjSK4xTMq8ejsAlqVEWuCQ16hU3hTLoXx8Xcq5EfyXVMA3jvJ1qmyNwSWqUBS5JjbLAJalRFrgkNcoCl6RGWeCS1CgLXJIa1c9Z6T+X5HiS/TP2rUzySJKD3eUFw40pSZqtnxH4F4AbZu3bBuypqnXAnm5bkrSM5i3wqvoP4H9n7d4I7Oyu7wQ2DTiXJGkei11Kv6qqjnTXjwKrXu6OSbYCWwEuvvjiRT7d4LksWVLrlvxHzKoqoM5y+46qmqyqyYmJiaU+nSSps9gCP5ZkNUB3eXxwkSRJ/VjsFMpuYDOwvbvcNbBEksaKJ4gYX/0cRngP8N/AZUkOJ9lCr7ivT3IQeG+3LUlaRvOOwKvq1pe5acOAs0iSFuAVd0IHSaPhyU+Wn0vpJalRFrgkNcopFOlVysVs7XMELkmNssAlqVEWuCQ1ygKXpEZZ4JLUKI9CkdS8V+vntTgCl6RGOQKXtOxerSPmQXMELkmNssAlqVFOoQyQS5MlLSdH4JLUKAtckhplgUtSoyxwSWqUBS5JjVrSUShJbgDuBM4B7qo
qz04vaWwNegHRqBckLXoEnuQc4DPA+4ErgFuTXDGoYJKks1vKFMrVwLNV9VxV/RK4F9g4mFiSpPksZQrlIuCHM7YPA380+05JtgJbu82fJnlmgc9zIfCjRSUcvnHNZq6FGddcML7ZliVX7ljwQwaSaxHPO9/3W2qu35tr59BXYlbVDmDHYh+fZKqqJgcYaWDGNZu5FmZcc8H4ZjPXwgwr11KmUF4A3jpje023T5K0DJZS4N8C1iW5JMl5wC3A7sHEkiTNZ9FTKFV1MsmfAV+jdxjh56rqyYElO23R0y/LYFyzmWthxjUXjG82cy3MUHKlqobxfSVJQ+ZKTElqlAUuSY0a6wJPckOSZ5I8m2TbCHN8LsnxJPtn7FuZ5JEkB7vLC0aQ661JHk3yVJInk3x0jLK9Lsk3k3y7y/aJbv8lSfZ2r+l93R/Al12Sc5I8nuThccmV5FCS7yZ5IslUt28cXsvzkzyQ5OkkB5JcOya5Lut+Vqe+Xkpy25hk+8vufb8/yT3dv4eBv8fGtsDHbKn+F4AbZu3bBuypqnXAnm57uZ0EPlZVVwDXAB/pfkbjkO0XwHVVdSWwHrghyTXAHcCnqupS4EVgywiyAXwUODBje1xyvaeq1s84ZngcXss7ga9W1eXAlfR+biPPVVXPdD+r9cDbgZ8DD406W5KLgL8AJqvqD+gd5HELw3iPVdVYfgHXAl+bsX07cPsI86wF9s/YfgZY3V1fDTwzBj+zXcD145YNeAPwGL2Vuj8Czp3rNV7GPGvo/cO+DngYyJjkOgRcOGvfSF9L4M3A9+kOeBiXXHPkfB/wX+OQjdOr1FfSO9LvYeCPh/EeG9sROHMv1b9oRFnmsqqqjnTXjwKrRhkmyVrgKmAvY5Ktm6Z4AjgOPAJ8DzhRVSe7u4zqNf008HHgN932W8YkVwFfT7Kv+wgKGP1reQkwDXy+m3K6K8mKMcg12y3APd31kWarqheAfwR+ABwB/g/YxxDeY+Nc4M2o3n+pIzseM8kbgS8Dt1XVSzNvG2W2qvp19X69XUPvw88uH0WOmZJ8EDheVftGnWUO76yqt9GbNvxIknfNvHFEr+W5wNuAz1bVVcDPmDUlMQbv//OAm4Avzb5tFNm6OfeN9P7z+11gBWdOwQ7EOBf4uC/VP5ZkNUB3eXwUIZK8hl55f7GqHhynbKdU1QngUXq/Np6f5NQCslG8pu8AbkpyiN4naF5Hb4531LlOjdyoquP05nKvZvSv5WHgcFXt7bYfoFfoo8410/uBx6rqWLc96mzvBb5fVdNV9SvgQXrvu4G/x8a5wMd9qf5uYHN3fTO9+edllSTA3cCBqvrkmGWbSHJ+d/319ObmD9Ar8g+NKltV3V5Va6pqLb331L9X1YdHnSvJiiRvOnWd3pzufkb8WlbVUeCHSS7rdm0Anhp1rllu5fT0CYw+2w+Aa5K8ofs3eupnNvj32Cj/8NDHHwM+APwPvbnTvxthjnvozWX9it6IZAu9edM9wEHg34CVI8j1Tnq/Hn4HeKL7+sCYZPtD4PEu237g77v9vw98E3iW3q+8rx3h6/pu4OFxyNU9/7e7rydPvd/H5LVcD0x1r+W/AheMQ64u2wrgx8CbZ+wbeTbgE8DT3Xv/n4HXDuM95lJ6SWrUOE+hSJLOwgKXpEZZ4JLUKAtckhplgUtSoyxwSWqUBS5Jjfp/Y0mBgVAYKmAAAAAASUVORK5CYII=\n", "text/plain": [""]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["plt.hist(d.Age.dropna().values, bins=30);"]}, {"cell_type": "markdown", "metadata": {}, "source": ["**Remove uninformative columns**"]}, {"cell_type": "code", 
"execution_count": 55, "metadata": {}, "outputs": [], "source": ["del(d[\"PassengerId\"])\n", "del(d[\"Name\"])\n", "del(d[\"Ticket\"])\n", "del(d[\"Cabin\"])"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": []}, {"cell_type": "markdown", "metadata": {}, "source": ["**Fix `NaN` values**\n", "\n", "- observe the different filling policies we decide to have"]}, {"cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " Survived | \n", " Pclass | \n", " Sex | \n", " Age | \n", " SibSp | \n", " Parch | \n", " Fare | \n", " Embarked | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 0 | \n", " 3 | \n", " male | \n", " 22.0 | \n", " 1 | \n", " 0 | \n", " 7.2500 | \n", " S | \n", "
\n", " \n", " 1 | \n", " 1 | \n", " 1 | \n", " female | \n", " 38.0 | \n", " 1 | \n", " 0 | \n", " 71.2833 | \n", " C | \n", "
\n", " \n", " 2 | \n", " 1 | \n", " 3 | \n", " female | \n", " 26.0 | \n", " 0 | \n", " 0 | \n", " 7.9250 | \n", " S | \n", "
\n", " \n", " 3 | \n", " 1 | \n", " 1 | \n", " female | \n", " 35.0 | \n", " 1 | \n", " 0 | \n", " 53.1000 | \n", " S | \n", "
\n", " \n", " 4 | \n", " 0 | \n", " 3 | \n", " male | \n", " 35.0 | \n", " 0 | \n", " 0 | \n", " 8.0500 | \n", " S | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" Survived Pclass Sex Age SibSp Parch Fare Embarked\n", "0 0 3 male 22.0 1 0 7.2500 S\n", "1 1 1 female 38.0 1 0 71.2833 C\n", "2 1 3 female 26.0 0 0 7.9250 S\n", "3 1 1 female 35.0 1 0 53.1000 S\n", "4 0 3 male 35.0 0 0 8.0500 S"]}, "execution_count": 56, "metadata": {}, "output_type": "execute_result"}], "source": ["d[\"Embarked\"] = d.Embarked.fillna(\"N\")\n", "d[\"Age\"] = d.Age.fillna(d.Age.mean())\n", "d.head()"]}, {"cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [{"data": {"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAPp0lEQVR4nO3dfYwcd33H8fenCU1LQCSpr5axnTptXZBbFSc9pUFBVSAteaDCQaqiRBVYNJL5w6hJhVQ5VCqgKlKQeGiR2kiGpIQKAimExgoRENxIiEoknIMJfsCNSxxiy7HNY6BIFIdv/9gx2Tjn3MPe3u79/H5Jq535zczO927mPjv725m5VBWSpLb8yqgLkCQtPMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBM4Z7ktVJHkyyJ8nuJDd27e9OcijJzu5xdd8yNyfZn2RfkiuG+QNIkp4vM53nnmQFsKKqHknyUmAHcA1wLfCTqnrfSfOvA+4CLgZeDnwJ+L2qemYI9UuSpjHjkXtVHa6qR7rhHwN7gZUvsMgG4JNV9bOqehzYTy/oJUmL5My5zJxkDXAh8BBwKfD2JG8BpoB3VNUP6AX/V/sWO8gLvxmwbNmyWrNmzVxKkaTT3o4dO75bVRPTTZt1uCd5CfAZ4KaqejrJbcA/ANU9vx/4qzm83iZgE8D555/P1NTUbBeVJAFJnjjVtFmdLZPkRfSC/eNVdQ9AVR2pqmeq6hfAh3m26+UQsLpv8VVd23NU1daqmqyqyYmJad94JEnzNJuzZQLcDuytqg/0ta/om+1NwK5ueBtwXZKzklwArAUeXriSJUkzmU23zKXAm4FvJtnZtb0TuD7JenrdMgeAtwFU1e4kdwN7gOPAZs+UkaTFNWO4V9VXgEwz6f4XWOYW4JYB6pIkDcArVCWpQYa7JDXIcJekBhnuktQgw12SGjSn2w9IC23Nls/Nar4Dt75hyJVIbfHIXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQTOGe5LVSR5MsifJ7iQ3du3nJXkgyWPd87lde5J8KMn+JI8muWjYP4Qk6blmc+R+HHhHVa0DLgE2J1kHbAG2V9VaYHs3DnAVsLZ7bAJuW/CqJUkvaMZwr6rDVfVIN/xjYC+wEtgA3NnNdidwTTe8AfhY9XwVOCfJigWvXJJ0SnPqc0+yBrgQeAhYXlWHu0lPAcu74ZXAk32LHezaJEmLZNbhnuQlwGeAm6rq6f5pVVVAzWXFSTYlmUoydezYsbksKkmawazCPcmL6AX7x6vqnq75yInulu75aN
d+CFjdt/iqru05qmprVU1W1eTExMR865ckTWM2Z8sEuB3YW1Uf6Ju0DdjYDW8E7u1rf0t31swlwI/6um8kSYvgzFnMcynwZuCbSXZ2be8EbgXuTnID8ARwbTftfuBqYD/wU+CtC1qxJGlGM4Z7VX0FyCkmXz7N/AVsHrAuSdIAvEJVkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoNmDPckdyQ5mmRXX9u7kxxKsrN7XN037eYk+5PsS3LFsAqXJJ3abI7cPwpcOU37B6tqffe4HyDJOuA64Pe7Zf4lyRkLVawkaXZmDPeq+jLw/Vm+3gbgk1X1s6p6HNgPXDxAfZKkeRikz/3tSR7tum3O7dpWAk/2zXOwa5MkLaL5hvttwO8A64HDwPvn+gJJNiWZSjJ17NixeZYhSZrOvMK9qo5U1TNV9Qvgwzzb9XIIWN0366qubbrX2FpVk1U1OTExMZ8yJEmnMK9wT7Kib/RNwIkzabYB1yU5K8kFwFrg4cFKlCTN1ZkzzZDkLuAyYFmSg8C7gMuSrAcKOAC8DaCqdie5G9gDHAc2V9UzwyldknQqM4Z7VV0/TfPtLzD/LcAtgxQlSRqMV6hKUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJatCM4Z7kjiRHk+zqazsvyQNJHuuez+3ak+RDSfYneTTJRcMsXpI0vdkcuX8UuPKkti3A9qpaC2zvxgGuAtZ2j03AbQtTpiRpLmYM96r6MvD9k5o3AHd2w3cC1/S1f6x6vgqck2TFQhUrSZqd+fa5L6+qw93wU8Dybngl8GTffAe7NknSIhr4C9WqKqDmulySTUmmkkwdO3Zs0DIkSX3mG+5HTnS3dM9Hu/ZDwOq++VZ1bc9TVVurarKqJicmJuZZhiRpOvMN923Axm54I3BvX/tburNmLgF+1Nd9I0laJGfONEOSu4DLgGVJDgLvAm4F7k5yA/AEcG03+/3A1cB+4KfAW4dQsyRpBjOGe1Vdf4pJl08zbwGbBy1KkjQYr1CVpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGjTj/1DV6WHNls/Nar4Dt75hyJVIWggeuUtSgwx3SWqQ4S5JDTLcJalBhrskNcizZRo22zNgWlu3JI/cJalJhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0a6CKmJAeAHwPPAMerajLJecCngDXAAeDaqvrBYGVKkuZiIY7cX1tV66tqshvfAmyvqrXA9m5ckrSIhtEtswG4sxu+E7hmCOuQJL2AQcO9gC8m2ZFkU9e2vKoOd8NPAcsHXIckaY4GvXHYa6rqUJLfBB5I8q3+iVVVSWq6Bbs3g00A559//oBlSJL6DXTkXlWHuuejwGeBi4EjSVYAdM9HT7Hs1qqarKrJiYmJQcqQJJ1k3uGe5OwkLz0xDLwe2AVsAzZ2s20E7h20SEnS3AzSLbMc+GySE6/ziar6fJKvAXcnuQF4Arh28DKl2ZntfeQP3PqGIVcijda8w72qvg28apr27wGXD1KUJGkwXqEqSQ0y3CWpQf4PVc2J/xtVWho8cpekBhnukt
Qgu2WWILtGJM3EcNdpaS5vkJ4Tr6XIbhlJapDhLkkNMtwlqUH2uWtJ8EtkaW48cpekBhnuktSgJd8t4yltkvR8Sz7cW2K/sqSFYrhLC8R/FKJxYp+7JDXIcJekBhnuktQgw12SGmS4S1KDPFtGmsFCn6LqWTVaDB65S1KDDHdJapDhLkkNss9dGlP2zWsQhrt0mvDN4vRyWoX7Qu/c/rFoHHjDOU3HPndJapDhLkkNOq26ZWbLj7mSljrDXdK8+b3T+DLcJQ2dbwKLz3CX9Bx2S7bBcJfUrNP5E8PQwj3JlcA/AWcAH6mqW4e1LkltOJ3DeKENJdyTnAH8M/BnwEHga0m2VdWeYaxv3PkxV9JiG9Z57hcD+6vq21X1f8AngQ1DWpck6STD6pZZCTzZN34Q+OMhrUuSBjKM7qBRdzGN7AvVJJuATd3oT5Lsm8fLLAO+u3BVLRjrmrtxrc265mZR6sp757XYwLXNc70zveYgdf3WqSYMK9wPAav7xld1bb9UVVuBrYOsJMlUVU0O8hrDYF1zN661WdfcjGtdML61DauuYfW5fw1Ym+SCJL8KXAdsG9K6JEknGcqRe1UdT/J24Av0ToW8o6p2D2NdkqTnG1qfe1XdD9w/rNfvDNStM0TWNXfjWpt1zc241gXjW9tQ6kpVDeN1JUkj5P3cJalBSzLck1yZZF+S/Um2jLiWO5IcTbKrr+28JA8keax7PncEda1O8mCSPUl2J7lxHGpL8mtJHk7yja6u93TtFyR5qNumn+q+iF90Sc5I8vUk941ZXQeSfDPJziRTXds47GfnJPl0km8l2Zvk1aOuK8krut/TicfTSW4adV1dbX/T7fe7ktzV/T0MZR9bcuHed2uDq4B1wPVJ1o2wpI8CV57UtgXYXlVrge3d+GI7DryjqtYBlwCbu9/TqGv7GfC6qnoVsB64MsklwHuBD1bV7wI/AG5Y5LpOuBHY2zc+LnUBvLaq1vedNjfqbQm9+0d9vqpeCbyK3u9upHVV1b7u97Qe+CPgp8BnR11XkpXAXwOTVfUH9E42uY5h7WNVtaQewKuBL/SN3wzcPOKa1gC7+sb3ASu64RXAvjH4vd1L714/Y1Mb8GLgEXpXL38XOHO6bbyI9ayi90f/OuA+IONQV7fuA8Cyk9pGui2BlwGP0313Ny51nVTL64H/Goe6ePbK/fPoncxyH3DFsPaxJXfkzvS3Nlg5olpOZXlVHe6GnwKWj7KYJGuAC4GHGIPauq6PncBR4AHgf4AfVtXxbpZRbdN/BP4W+EU3/htjUhdAAV9MsqO7uhtGvy0vAI4B/9p1ZX0kydljUFe/64C7uuGR1lVVh4D3Ad8BDgM/AnYwpH1sKYb7klK9t+ORnZKU5CXAZ4Cbqurp/mmjqq2qnqneR+ZV9G4y98rFruFkSf4cOFpVO0Zdyym8pqouotcduTnJn/RPHNG2PBO4CLitqi4E/peTujpGuf93fddvBP795GmjqKvr499A703x5cDZPL9Ld8EsxXCf8dYGY+BIkhUA3fPRURSR5EX0gv3jVXXPONUGUFU/BB6k91H0nCQnrrsYxTa9FHhjkgP07mL6Onr9yaOuC/jlUR9VdZRe//HFjH5bHgQOVtVD3fin6YX9qOs64Srgkao60o2Puq4/BR6vqmNV9XPgHnr73VD2saUY7kvh1gbbgI3d8EZ6/d2LKkmA24G9VfWBcaktyUSSc7rhX6f3PcBeeiH/F6Oqq6purqpVVbWG3j71n1X1l6OuCyDJ2UleemKYXj/yLka8LavqKeDJJK/omi4H9oy6rj7X82yXDIy+ru8AlyR5cff3eeL3NZx9bFRfdAz4xcTVwH/T66v9uxHXche9/rOf0zuSuYFeX+124DHgS8B5I6jrNfQ+dj4K7OweV4+6NuAPga93de0C/r5r/23gYWA/vY/RZ41wm14G3DcudXU1fKN77D6xz496W3Y1rAemuu35H8C5Y1LX2cD3gJf1tY1DXe8Bvt
Xt+/8GnDWsfcwrVCWpQUuxW0aSNAPDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBv0/tUOT2W1OBfcAAAAASUVORK5CYII=\n", "text/plain": [""]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["plt.hist(d.Age.dropna().values, bins=30);"]}, {"cell_type": "markdown", "metadata": {}, "source": ["**Turn categorical columns to a `one_hot` encoding**"]}, {"cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [{"data": {"text/plain": ["array([[0, 0, 0, 1],\n", " [1, 0, 0, 0],\n", " [0, 0, 0, 1],\n", " [0, 0, 0, 1],\n", " [0, 0, 0, 1]])"]}, "execution_count": 58, "metadata": {}, "output_type": "execute_result"}], "source": ["def to_onehot(x):\n", "    \"\"\"One-hot encode a 1-D array of categorical values.\n", "\n", "    The categories are the sorted unique values of `x` (as returned by `np.unique`).\n", "    Returns an int array with one row per element of `x` and one column per\n", "    category, holding a 1 in the column of the element's category.\n", "    \"\"\"\n", "    # return_inverse gives every element's category index in one pass, so there is\n", "    # no per-element search and empty input yields an empty (0, 0) array\n", "    values, codes = np.unique(x, return_inverse=True)\n", "    return np.eye(len(values), dtype=int)[codes.ravel()]\n", " \n", "k = to_onehot(d.Embarked.values)\n", "k[:5]"]}, {"cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": ["def replace_columns_with_onehot(d, col):\n", "    \"\"\"Return a new dataframe in which column `col` is replaced by its one-hot encoding.\n", "\n", "    The one-hot columns are named \"%s_%d\" % (col, j), where j is the position of the\n", "    category among the sorted unique values of the column, and they are placed\n", "    before the remaining columns. The dataframe `d` itself is not modified.\n", "    \"\"\"\n", "    onehot = to_onehot(d[col].values)\n", "    names = [\"%s_%d\" % (col, j) for j in range(onehot.shape[1])]\n", "    return pd.DataFrame(onehot, columns=names, index=d.index).join(d.drop(columns=col))"]}, {"cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " Survived | \n", " Pclass | \n", " Sex | \n", " Age | \n", " SibSp | \n", " Parch | \n", " Fare | \n", " Embarked | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 0 | \n", " 3 | \n", " male | \n", " 22.0 | \n", " 1 | \n", " 0 | \n", " 7.2500 | \n", " S | \n", "
\n", " \n", " 1 | \n", " 1 | \n", " 1 | \n", " female | \n", " 38.0 | \n", " 1 | \n", " 0 | \n", " 71.2833 | \n", " C | \n", "
\n", " \n", " 2 | \n", " 1 | \n", " 3 | \n", " female | \n", " 26.0 | \n", " 0 | \n", " 0 | \n", " 7.9250 | \n", " S | \n", "
\n", " \n", " 3 | \n", " 1 | \n", " 1 | \n", " female | \n", " 35.0 | \n", " 1 | \n", " 0 | \n", " 53.1000 | \n", " S | \n", "
\n", " \n", " 4 | \n", " 0 | \n", " 3 | \n", " male | \n", " 35.0 | \n", " 0 | \n", " 0 | \n", " 8.0500 | \n", " S | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" Survived Pclass Sex Age SibSp Parch Fare Embarked\n", "0 0 3 male 22.0 1 0 7.2500 S\n", "1 1 1 female 38.0 1 0 71.2833 C\n", "2 1 3 female 26.0 0 0 7.9250 S\n", "3 1 1 female 35.0 1 0 53.1000 S\n", "4 0 3 male 35.0 0 0 8.0500 S"]}, "execution_count": 60, "metadata": {}, "output_type": "execute_result"}], "source": ["d.head()"]}, {"cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " Embarked_0 | \n", " Embarked_1 | \n", " Embarked_2 | \n", " Embarked_3 | \n", " Survived | \n", " Pclass | \n", " Sex | \n", " Age | \n", " SibSp | \n", " Parch | \n", " Fare | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 0 | \n", " 0 | \n", " 0 | \n", " 1 | \n", " 0 | \n", " 3 | \n", " male | \n", " 22.0 | \n", " 1 | \n", " 0 | \n", " 7.2500 | \n", "
\n", " \n", " 1 | \n", " 1 | \n", " 0 | \n", " 0 | \n", " 0 | \n", " 1 | \n", " 1 | \n", " female | \n", " 38.0 | \n", " 1 | \n", " 0 | \n", " 71.2833 | \n", "
\n", " \n", " 2 | \n", " 0 | \n", " 0 | \n", " 0 | \n", " 1 | \n", " 1 | \n", " 3 | \n", " female | \n", " 26.0 | \n", " 0 | \n", " 0 | \n", " 7.9250 | \n", "
\n", " \n", " 3 | \n", " 0 | \n", " 0 | \n", " 0 | \n", " 1 | \n", " 1 | \n", " 1 | \n", " female | \n", " 35.0 | \n", " 1 | \n", " 0 | \n", " 53.1000 | \n", "
\n", " \n", " 4 | \n", " 0 | \n", " 0 | \n", " 0 | \n", " 1 | \n", " 0 | \n", " 3 | \n", " male | \n", " 35.0 | \n", " 0 | \n", " 0 | \n", " 8.0500 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" Embarked_0 Embarked_1 Embarked_2 Embarked_3 Survived Pclass Sex \\\n", "0 0 0 0 1 0 3 male \n", "1 1 0 0 0 1 1 female \n", "2 0 0 0 1 1 3 female \n", "3 0 0 0 1 1 1 female \n", "4 0 0 0 1 0 3 male \n", "\n", " Age SibSp Parch Fare \n", "0 22.0 1 0 7.2500 \n", "1 38.0 1 0 71.2833 \n", "2 26.0 0 0 7.9250 \n", "3 35.0 1 0 53.1000 \n", "4 35.0 0 0 8.0500 "]}, "execution_count": 61, "metadata": {}, "output_type": "execute_result"}], "source": ["d = replace_columns_with_onehot(d, \"Embarked\")\n", "d.head()"]}, {"cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " Sex_0 | \n", " Sex_1 | \n", " Embarked_0 | \n", " Embarked_1 | \n", " Embarked_2 | \n", " Embarked_3 | \n", " Survived | \n", " Pclass | \n", " Age | \n", " SibSp | \n", " Parch | \n", " Fare | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 0 | \n", " 1 | \n", " 0 | \n", " 0 | \n", " 0 | \n", " 1 | \n", " 0 | \n", " 3 | \n", " 22.0 | \n", " 1 | \n", " 0 | \n", " 7.2500 | \n", "
\n", " \n", " 1 | \n", " 1 | \n", " 0 | \n", " 1 | \n", " 0 | \n", " 0 | \n", " 0 | \n", " 1 | \n", " 1 | \n", " 38.0 | \n", " 1 | \n", " 0 | \n", " 71.2833 | \n", "
\n", " \n", " 2 | \n", " 1 | \n", " 0 | \n", " 0 | \n", " 0 | \n", " 0 | \n", " 1 | \n", " 1 | \n", " 3 | \n", " 26.0 | \n", " 0 | \n", " 0 | \n", " 7.9250 | \n", "
\n", " \n", " 3 | \n", " 1 | \n", " 0 | \n", " 0 | \n", " 0 | \n", " 0 | \n", " 1 | \n", " 1 | \n", " 1 | \n", " 35.0 | \n", " 1 | \n", " 0 | \n", " 53.1000 | \n", "
\n", " \n", " 4 | \n", " 0 | \n", " 1 | \n", " 0 | \n", " 0 | \n", " 0 | \n", " 1 | \n", " 0 | \n", " 3 | \n", " 35.0 | \n", " 0 | \n", " 0 | \n", " 8.0500 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" Sex_0 Sex_1 Embarked_0 Embarked_1 Embarked_2 Embarked_3 Survived \\\n", "0 0 1 0 0 0 1 0 \n", "1 1 0 1 0 0 0 1 \n", "2 1 0 0 0 0 1 1 \n", "3 1 0 0 0 0 1 1 \n", "4 0 1 0 0 0 1 0 \n", "\n", " Pclass Age SibSp Parch Fare \n", "0 3 22.0 1 0 7.2500 \n", "1 1 38.0 1 0 71.2833 \n", "2 3 26.0 0 0 7.9250 \n", "3 1 35.0 1 0 53.1000 \n", "4 3 35.0 0 0 8.0500 "]}, "execution_count": 62, "metadata": {}, "output_type": "execute_result"}], "source": ["d = replace_columns_with_onehot(d, \"Sex\")\n", "d.head()"]}, {"cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [{"data": {"text/plain": ["((891, 12), 60142.86312352941)"]}, "execution_count": 63, "metadata": {}, "output_type": "execute_result"}], "source": ["d.shape, d.values.sum()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Put all transformations together"]}, {"cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": ["def clean_titanic(d):\n", "    \"\"\"Return a cleaned copy of a raw Titanic dataframe.\n", "\n", "    Drops the PassengerId, Name, Ticket and Cabin columns, fills missing Embarked\n", "    values with the extra category \"N\" and missing Fare/Age values with the column\n", "    mean, then one-hot encodes Embarked and Sex. The input dataframe is not modified.\n", "    \"\"\"\n", "    # build a new frame instead of deleting columns from the caller's dataframe,\n", "    # so the function can be re-run on the same input\n", "    d = d.drop(columns=[\"PassengerId\", \"Name\", \"Ticket\", \"Cabin\"])\n", "    d = d.assign(\n", "        Embarked=d.Embarked.fillna(\"N\"),\n", "        Fare=d.Fare.fillna(d.Fare.mean()),\n", "        Age=d.Age.fillna(d.Age.mean()),\n", "    )\n", "    d = replace_columns_with_onehot(d, \"Embarked\")\n", "    d = replace_columns_with_onehot(d, \"Sex\")\n", "    return d"]}, {"cell_type": "markdown", "metadata": {}, "source": ["**transform train and test data together**\n", "\n", "- observe that test data **does not have** a `Survived` column. 
This is the result to submit to Kaggle"]}, {"cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [{"data": {"text/plain": ["((891, 12), (418, 11))"]}, "execution_count": 67, "metadata": {}, "output_type": "execute_result"}], "source": ["dtr = pd.read_csv(\"train.csv\")\n", "dts = pd.read_csv(\"test.csv\")\n", "lentr = len(dtr)\n", "dtr.shape, dts.shape"]}, {"cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " PassengerId | \n", " Pclass | \n", " Name | \n", " Sex | \n", " Age | \n", " SibSp | \n", " Parch | \n", " Ticket | \n", " Fare | \n", " Cabin | \n", " Embarked | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 892 | \n", " 3 | \n", " Kelly, Mr. James | \n", " male | \n", " 34.5 | \n", " 0 | \n", " 0 | \n", " 330911 | \n", " 7.8292 | \n", " NaN | \n", " Q | \n", "
\n", " \n", " 1 | \n", " 893 | \n", " 3 | \n", " Wilkes, Mrs. James (Ellen Needs) | \n", " female | \n", " 47.0 | \n", " 1 | \n", " 0 | \n", " 363272 | \n", " 7.0000 | \n", " NaN | \n", " S | \n", "
\n", " \n", " 2 | \n", " 894 | \n", " 2 | \n", " Myles, Mr. Thomas Francis | \n", " male | \n", " 62.0 | \n", " 0 | \n", " 0 | \n", " 240276 | \n", " 9.6875 | \n", " NaN | \n", " Q | \n", "
\n", " \n", " 3 | \n", " 895 | \n", " 3 | \n", " Wirz, Mr. Albert | \n", " male | \n", " 27.0 | \n", " 0 | \n", " 0 | \n", " 315154 | \n", " 8.6625 | \n", " NaN | \n", " S | \n", "
\n", " \n", " 4 | \n", " 896 | \n", " 3 | \n", " Hirvonen, Mrs. Alexander (Helga E Lindqvist) | \n", " female | \n", " 22.0 | \n", " 1 | \n", " 1 | \n", " 3101298 | \n", " 12.2875 | \n", " NaN | \n", " S | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" PassengerId Pclass Name Sex \\\n", "0 892 3 Kelly, Mr. James male \n", "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n", "2 894 2 Myles, Mr. Thomas Francis male \n", "3 895 3 Wirz, Mr. Albert male \n", "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", "\n", " Age SibSp Parch Ticket Fare Cabin Embarked \n", "0 34.5 0 0 330911 7.8292 NaN Q \n", "1 47.0 1 0 363272 7.0000 NaN S \n", "2 62.0 0 0 240276 9.6875 NaN Q \n", "3 27.0 0 0 315154 8.6625 NaN S \n", "4 22.0 1 1 3101298 12.2875 NaN S "]}, "execution_count": 68, "metadata": {}, "output_type": "execute_result"}], "source": ["dts.head()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["**get data ready for training**"]}, {"cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["(891, 11) (891,)\n", "(418, 11)\n"]}], "source": ["source_cols = [i for i in dtr.columns if i!=\"Survived\"]\n", "all_data = pd.concat((dtr[source_cols], dts[source_cols]))\n", "all_data.index = range(len(all_data))\n", "all_data = clean_titanic(all_data)\n", "\n", "Xtr, ytr = all_data.iloc[:lentr].values, dtr[\"Survived\"].values\n", "Xts = all_data.iloc[lentr:].values\n", "\n", "print (Xtr.shape, ytr.shape)\n", "print (Xts.shape)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["**cross validate for model selection**"]}, {"cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["[0.77094972 0.81460674 0.84831461 0.79213483 0.83707865]\n", "[0.59217877 0.71348315 0.69101124 0.68539326 0.69101124]\n"]}], "source": ["from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.svm import SVC\n", "from sklearn.model_selection import cross_val_score\n", "rf = RandomForestClassifier()\n", "print (cross_val_score(rf, Xtr, ytr))\n", "\n", "svc = SVC()\n", "print (cross_val_score(svc, Xtr, ytr))"]}, {"cell_type": "markdown", "metadata": {}, "source": 
["**now train with full dataset and generate submission for Kaggle**"]}, {"cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [{"data": {"text/plain": ["array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,\n", " 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,\n", " 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,\n", " 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,\n", " 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n", " 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,\n", " 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,\n", " 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,\n", " 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,\n", " 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,\n", " 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,\n", " 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,\n", " 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,\n", " 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,\n", " 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,\n", " 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1])"]}, "execution_count": 73, "metadata": {}, "output_type": "execute_result"}], "source": ["rf.fit(Xtr, ytr)\n", "preds_ts = rf.predict(Xts)\n", "preds_ts"]}, {"cell_type": "markdown", "metadata": {}, "source": ["**get predictions ready to submit to Kaggle**\n", "\n", "- see https://www.kaggle.com/c/titanic#evaluation for file format"]}, {"cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " PassengerId | \n", " Survived | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 892 | \n", " 0 | \n", "
\n", " \n", " 1 | \n", " 893 | \n", " 0 | \n", "
\n", " \n", " 2 | \n", " 894 | \n", " 0 | \n", "
\n", " \n", " 3 | \n", " 895 | \n", " 1 | \n", "
\n", " \n", " 4 | \n", " 896 | \n", " 0 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" PassengerId Survived\n", "0 892 0\n", "1 893 0\n", "2 894 0\n", "3 895 1\n", "4 896 0"]}, "execution_count": 74, "metadata": {}, "output_type": "execute_result"}], "source": ["submission = pd.DataFrame([dts.PassengerId, pd.Series(preds_ts, name=\"Survived\")]).T\n", "submission.head()"]}, {"cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": ["submission.to_csv(\"titanic_kaggle.csv\", index=False)"]}, {"cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["PassengerId,Survived\r\n", "892,0\r\n", "893,0\r\n", "894,0\r\n", "895,1\r\n", "896,0\r\n", "897,0\r\n", "898,0\r\n", "899,0\r\n", "900,1\r\n"]}], "source": ["!head titanic_kaggle.csv"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": []}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "p37", "language": "python", "name": "p37"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6"}}, "nbformat": 4, "nbformat_minor": 2}